###############################################
#                                             #
# Value of Precision                          #
# Scripts for Data Analysis                   #
# Last Updated: 7/19/17                       #
#                                             #
# Josh Baker                                  #
# jbak@sas.upenn.edu                          #
#                                             #
###############################################


###########################################################################################
#                                 A Note on Functions                                     #
###########################################################################################

## Many of the analyses included in our manuscript require purpose-built functions to execute.
## For the sake of clarity, these functions have been included in a separate document called
## "Value.of.Precision_Functions_7.19.17".
##
## PRIOR TO RUNNING ANY OF THE CODE IN THIS DOCUMENT, PLEASE LOAD ALL FUNCTIONS FROM
## "Value.of.Precision_Functions_7.19.17" INTO YOUR WORKSPACE.

###########################################################################################
#                     Install and Load the Package "data.table"                           #
###########################################################################################
install.packages("data.table")
library(data.table)

###########################################################################################
#                              Import and Prepare Data                                    #
###########################################################################################

# Import Raw Data from GJP
fcasts <- read.csv("fcasts.csv")

# Change data types for some variables to make them more useful for later analyses
fcasts$ifp_id <- as.character(fcasts$ifp_id)
fcasts$answer_option <- as.character(fcasts$answer_option)
fcasts$date <- as.character(fcasts$date)
fcasts$date.closed <- as.character(fcasts$date.closed)
fcasts$timestamp <- as.character(fcasts$timestamp)
fcasts$outcome <- as.character(fcasts$outcome)
fcasts$g.tnt <- as.character(fcasts$g.tnt)
fcasts$mod.tag <- as.character(fcasts$mod.tag)
fcasts$time.horiz <- as.character(fcasts$time.horiz)

###########################################################################################
#                                  Variable Key                                           #
###########################################################################################

## ifp_id: a numerical identifier for forecasting question (note: IFP stands for "Individual Forecasting Problem")
## user_id: a numerical identifier for subjects
## answer_option: indicates whether the forecast in question represents the probability of the event occurring ("a") or not occurring ("b")
## date: date the forecast was made
## year: torunament year during which the forecast was made
## timestamp: timestamp from when forecast was made
## outcome: observed outcome for event in question (cf. "answer_option")
## g.tnt: comparison group identifier. Stands for "group, training/no training." Six levels.
##    "all.fc" = all forecasters;
##    "i.t" = individuals w/training;
##    "i.nt" = individuals w/o training
##    "g.t" = non-superforecaster groups w/training;
##    "g.nt" = non-superforecaster groups w/o training;
##    "s.t" = superfoecasters, all of who were trained and worked in groups.
## mod.tag: indicator variable designating how fine-grained the forecast was. Six levels.
##    "mod.0" = forecast of exactly 0.00
##    "mod.50" = forecast of exactly 0.50
##    "mod.1.0" = forecast of exactly 1.0
##    "mod.10" = forecast is multiple of 0.10, excluding all categories above (stands for modulus 0.10)
##    "mod.05" = forecast is multiple of 0.05, excluding all categories above (stands for modulus 0.05)
##    "mod.01" = forecast is multiple of 0.01, excluding all categories above (stands for modulus 0.01)
## date.closed: date on which the ifp in question closed to forecasters (note: generally ifps closed on a specified date, but some closed early when resolution criteria were met)
## t.minus_days: the time elapsed between forecast and ifp resolution, expressed in days.
## time.horiz.code: numerical indicator denoting which time horizon category the associated forecast falls under (see manuscript).
## time.horiz: plain English indicator denoting which time horizon category the associated forecast falls under (see manuscript)
## wep: numerical indicator denoting which wep bin the associated forecast falls within (equal sized bins, B = 7, see manuscript)
## nic.bin: numerical indicatr denoting which nic bin the associated forecast falls within (see manuscript)
## val.unrounded: stated forecast of the associated answer_option occurring (zero to one, two decimal places)
##    val.2: stated forecast, rounded to the nearest midpoint of 2 equal sized bins (B = 2)
##    val.3: stated forecast, rounded to the nearest midpoint of 3 equal sized bins (B = 3)
##    val.5: stated forecast, rounded to the nearest midpoint of 5 equal sized bins (B = 5)
##    val.7: stated forecast, rounded to the nearest midpoint of 7 equal sized bins (B = 7)
## val.nic.mid: stated forecast, rounded to the nearest midpoint of the NIC bins (see manuscript)
## val.nic.alt.mid: slight variation on val.nic.mid (slightly different NIC scoring). Not used in manuscript
## val.ev.equi: stated forecast, rounded to the empirical expected value (frequency-weighted mean) of the equal-sized wep bin (B = 7) within which it falls
## val.ev.nic: stated forecast, rounded to the empirical expected value (frequency-weighted mean) of the NIC bin whithin which it falls
## bs.unrounded: brier score associated with val.unrounded
##    bs.2: brier score associated with val.2
##    bs.3: brier score associated with val.3
##    bs.5: brier score associated with val.5
##    bs.6: brier score associated with val.7
##    bs.nic.mid: brier score associated with val.nic.mid
##    bs.nic.alt.mid: brier score associated with val.nic.alt.mid
##    bs.ev.equi: brier score associated with val.ev.equi
##    bs.ev.nic: brier score associated with val.ev.nic
## log.unrounded: logarithmic score associated with val.unrounded
##    log.2: logarithmic score associated with val.2
##    log.3: logarithmic score associated with val.3
##    log.5: logarithmic score associated with val.5
##    log.6: logarithmic score associated with val.7
##    log.nic.mid: logarithmic score associated with val.nic.mid
##    log.nic.alt.mid: logarithmic score associated with val.nic.alt.mid
##    log.ev.equi: logarithmic score associated with val.ev.equi
##    log.ev.nic: logarithmic score associated with val.ev.nic



###########################################################################################
#                     Some notes about the data table "fcasts"                            #
###########################################################################################

## For the sake of coherence with our manuscript, the data-set accompanying this code ("fcasts")
## is identical to the data-set used in our analyses. Were one to re-create this table from
## scratch, the end result would be slightly different, due to the fact that we randomized the
## direction of rounding when forecasts landed on a "bin" boundary.
##
## Furthermore, "fcasts" is a very large data set. To save end-users from having to compute
## accuracy scores for themselves (a process that can be very time-consuming on certain computers),
## "fcasts" also includes all of the necessary information (e.g., meta-data, accuracy scores)
## to replicate our analyses right away.
##
## For the sake of transparency, however, the code necessary to compute these variables
## is included below.


###########################################################################################
#         Building "fcasts" from scratch (for informational purposes only)                #
###########################################################################################

# Duplicate fcasts and truncate to replicate original, minimal data set
fcasts.basic <- fcasts
fcasts.basic <- fcasts.basic[c(1:8,10,16)]

# Identify WEP bin for each forecast, add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic, wep=sapply(fcasts.basic$val.unrounded, id.wep))

# Identify NIC bin for each forecast, add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic, nic.bin=sapply(fcasts.basic$val.unrounded, id.nic))

# Identify mod.tag category for each forecast, add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic, mod.tag=sapply(fcasts.basic$val.unrounded, id.mod.tag))

# Calculate t.minus_days, add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic,t.minus_days=as.numeric(difftime(fcasts.basic$date, fcasts.basic$date.closed, unit="days")))

# Calculate time.horiz.code and time.horiz, and add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic,time.horiz.tag(fcasts.basic))

# Round forecasts to midpoints of 2, 3, 5, and 7 equal sized bins, and add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic, val.2=Round.To(fcasts.basic$val.unrounded,2))
fcasts.basic <- cbind(fcasts.basic, val.3=Round.To(fcasts.basic$val.unrounded,3))
fcasts.basic <- cbind(fcasts.basic, val.5=Round.To(fcasts.basic$val.unrounded,5))
fcasts.basic <- cbind(fcasts.basic, val.7=Round.To(fcasts.basic$val.unrounded,7))

# Round forecast to midpoints and expected values of NIC bins and equal-sized bins, add to fcasts.basic
fcasts.basic <- cbind(fcasts.basic, val.nic.mid=Round.To_NIC(fcasts.basic$val.unrounded))
fcasts.basic <- cbind(fcasts.basic, val.nic.alt.mid=Round.To_NIC.alt(fcasts.basic$val.unrounded))
fcasts.basic <- cbind(fcasts.basic, val.ev.equi=Round.To.EV(fcasts.basic$val.unrounded, 7))
fcasts.basic <- cbind(fcasts.basic, val.ev.nic=Round.To.EV_NIC.7(fcasts.basic))

# Re-arrange column order to match fcasts
fcasts.basic <- fcasts.basic[c(1:8,13,9,14,16,15,11,12,10,17:24)]

# Brier Score forecasts, add to fcasts.basic
rownames(fcasts.basic) <- seq.int(1:nrow(fcasts.basic)) # Ensure that rownames are in ascending order
fcasts.basic <- cbind(fcasts.basic, bs.unrounded=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.unrounded"))))
fcasts.basic <- cbind(fcasts.basic, bs.2=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.2"))))
fcasts.basic <- cbind(fcasts.basic, bs.3=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.3"))))
fcasts.basic <- cbind(fcasts.basic, bs.5=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.5"))))
fcasts.basic <- cbind(fcasts.basic, bs.7=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.7"))))
fcasts.basic <- cbind(fcasts.basic, bs.nic.mid=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.nic.mid"))))
fcasts.basic <- cbind(fcasts.basic, bs.nic.alt.mid=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.nic.alt.mid"))))
fcasts.basic <- cbind(fcasts.basic, bs.ev.equi=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.ev.equi"))))
fcasts.basic <- cbind(fcasts.basic, bs.ev.nic=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), bs, which(colnames(fcasts.basic)=="val.ev.nic"))))


# Log Score forecasts, add to fcasts.basic
rownames(fcasts.basic) <- seq.int(1:nrow(fcasts.basic)) # Ensure that rownames are in ascending order
fcasts.basic <- cbind(fcasts.basic, log.unrounded=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.unrounded"))))
fcasts.basic <- cbind(fcasts.basic, log.2=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.2"))))
fcasts.basic <- cbind(fcasts.basic, log.3=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.3"))))
fcasts.basic <- cbind(fcasts.basic, log.5=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.5"))))
fcasts.basic <- cbind(fcasts.basic, log.7=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.7"))))
fcasts.basic <- cbind(fcasts.basic, log.nic.mid=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.nic.mid"))))
fcasts.basic <- cbind(fcasts.basic, log.nic.alt.mid=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.nic.alt.mid"))))
fcasts.basic <- cbind(fcasts.basic, log.ev.equi=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.ev.equi"))))
fcasts.basic <- cbind(fcasts.basic, log.ev.nic=as.vector(by(fcasts.basic, as.numeric(rownames(fcasts.basic)), log.score, which(colnames(fcasts.basic)=="val.ev.nic"))))

# Final updates to data types
fcasts.basic$mod.tag <- as.character(fcasts.basic$mod.tag)
fcasts.basic$time.horiz <- as.character(fcasts.basic$time.horiz)


######## ONLY RUN THE NEXT LINE OF CODE IF YOU WOULD LIKE TO REMOVE fcasts.basic
## Note: you will need to delete the comment symbol (#) for this line to run correctly

#rm(fcasts.basic) 



###########################################################################################
#   Generate all "returns to precision" data in manuscript and supplementary materials    #
###########################################################################################

## PLEASE NOTE: Generating the following tables is extremely computationally demanding
## Depending on you computer, please expect each table to take several hours to run.
##
## If you can plan to leave your computer unattended while a batch of tables is generated,
## we strongly recommend that you designate a temporary file to save your progress after
## each step is completed. To do so, simply use the function "save.image()" after each of
## the following lines, with a designated save path as an argument to this function.
##
## e.g.
##      save.image("C:\Users\[user.name]\[Folder]\[Sub.Folder]...\[filename].RData")


# Brier Scoring
bs.comp.group.mean <- gen.tab_master(method="bs",type="comp.group",stat="mean")
save.image("C:\\Users\\joshb\\Documents\\A.MyStuff\\Penn\\Research\\Value.of.Precision\\SAFEKEEPING_Final.Analyses_for.ISQ\\ISQ.Analyses_7.19.17\\In.Out\\post.ar.RData")

bs.comp.group.median <- gen.tab_master(method="bs",type="comp.group",stat="median")

bs.time.horiz.mean <- gen.tab_master(method="bs",type="time.horiz",stat="mean")

bs.time.horiz.median <- gen.tab_master(method="bs",type="time.horiz",stat="median")

bs.wep.mean <- gen.tab_master(method="bs",type="wep",stat="mean")

bs.wep.median <- gen.tab_master(method="bs",type="wep",stat="median")

bs.nic.mean <- gen.tab_master(method="bs",type="nic",stat="mean")

bs.nic.median <- gen.tab_master(method="bs",type="nic",stat="median")


# Logarithmic Scoring
log.comp.group.mean <- gen.tab_master(method="log",type="comp.group",stat="mean")

log.comp.group.median <- gen.tab_master(method="log",type="comp.group",stat="median")

log.time.horiz.mean <- gen.tab_master(method="log",type="time.horiz",stat="mean")

log.time.horiz.median <- gen.tab_master(method="log",type="time.horiz",stat="median")

log.wep.mean <- gen.tab_master(method="log",type="wep",stat="mean")

log.wep.median <- gen.tab_master(method="log",type="wep",stat="median")

log.nic.mean <- gen.tab_master(method="log",type="nic",stat="mean")

log.nic.median <- gen.tab_master(method="log",type="nic",stat="median")
